##########################################
### Intro to Regression
### Samantha Zuhlke
###
### This R script reviews: 
### - correlations, scatterplots, trend lines
### - introduces multivariate regression
##########################################

# Preamble: check the current working directory, point it at the folder that
# holds this tutorial's files, then confirm the change took effect.
getwd()
setwd("Insert file path")
getwd()
# Two reminders:
##### Everyone's working directory will be different, so replace the
##### placeholder path above with your own before running the script.
##### Every R script needs a preamble.

# Note: you may want to use the "here" package with students to set workspaces,
## which makes setting workspaces much easier.
##########################################

# Clear the environment: removes every object returned by ls().
rm(list=ls())

# Note: it is not best practice to begin your R script with rm(list=ls()),
#   because it only clears objects in your Global Environment and not other settings.
# It is better to start a new R session when you run a new script.
# However, I find that impractical when teaching multiple tutorials in a lab,
#   and so use rm(list=ls()) in tutorials when moving between scripts.
# If you do the same, be sure to flag for your students that this isn't best practice!

##########################################
# Review
# Load packages and import data

# ggplot2 supplies the `diamonds` data set and qplot(); dplyr supplies the
# %>% pipe and sample_n(). The script uses all of these, so attach both
# packages here; otherwise the next line fails in a fresh R session.
# install.packages(c("ggplot2", "dplyr"))
library(ggplot2)
library(dplyr)

# Work with a random sample of 1,000 rows. The sample is stored under the
# name `diamonds` in the Global Environment, so every later reference to
# `diamonds` in this script uses the sample rather than the full data set.
diamonds <- diamonds %>% sample_n(1000)
# Or, if the dplyr package is not working, skip the line above and use the
# full `diamonds` data set from ggplot2 as-is.

# Examine our data and variables of interest: carat and price.
summary(diamonds)
summary(diamonds$price)

# Histograms show the distribution of each variable.
hist(diamonds$carat)
hist(diamonds$price)



##########################################
# Correlations, Scatterplots, Trend lines

# Let's say we are interested in the relationship between size (carat)
# and price.

# Summary statistics for carat: five-number summary and mean, standard
# deviation, then two views of the distribution (base hist() and qplot()).
summary(diamonds$carat)
sd(diamonds$carat)
hist(diamonds$carat)
qplot(x=carat, data=diamonds)

# The same set of summaries for price.
summary(diamonds$price)
sd(diamonds$price)
hist(diamonds$price)
qplot(x=price, data=diamonds)

# t tests
# A t test is a type of bivariate analysis: it compares the means of two
# groups and asks whether those means are significantly different.

# Independent 2-group t test, two-vector form:
# t.test(y, x), where both variables are numeric (i.e. interval).
t.test(x = diamonds$price, y = diamonds$carat)
# NULL == "no meaningful difference in means"
# Do we reject or fail to reject the null?

# But you need to pay attention to how the variables are measured.
# Independent 2-group t test, formula form:
# t.test(y ~ x), where y is numeric and x is a binary factor.
# `cut` has more than two categories, as this bar chart shows.
qplot(cut, data = diamonds)

# Means by group / summary statistics by group.
# install.packages("psych")
library(psych)
describeBy(diamonds, group = diamonds$cut)
describeBy(diamonds$price, group = diamonds$cut)

# Collapse cut into two categories: "0" = Fair, "1" = every other cut
# (Good through Ideal).
# Do Fair diamonds have a different mean price than other cuts?
diamonds$cutcat <- factor(ifelse(diamonds$cut == "Fair", "0", "1"))
# View(diamonds)
summary(diamonds$cutcat)

t.test(diamonds$price ~ diamonds$cutcat)

# Scatterplot
# The command for a scatterplot is plot(INDEPENDENT VARIABLE, DEPENDENT VARIABLE),
# i.e. the first argument goes on the x-axis and the second on the y-axis.
plot(diamonds$carat,diamonds$price)
# This is the type of graph you needed to make in RS2.
# If we want to add axis labels and a title:
plot(diamonds$carat,diamonds$price,ylab="Price",
     xlab="Carat",main="My Scatterplot!")

# Add a trend line to the scatterplot drawn above.
# The line comes from a linear model: OUTCOME ~ PREDICTOR, DATA = DATA NAME
# lty = line type, lwd = line width, col = line color.
abline(reg = lm(price~carat,data=diamonds),lty=1,lwd=4, 
       col = "red")
# This trend line will become important when we talk about regression.

# Correlations
# cor(Y, X)
cor(diamonds$price,diamonds$carat)

# In-class question: is this a strong, medium, or weak correlation?
# Is this a statistically significant correlation?

# Correlation Tests
# Pearson correlation (similarity)
# Recall that Pearson tests are for comparing interval-level data: IV and DV
# cor.test(y, x)
cor.test(diamonds$price,diamonds$carat,method = "pearson")
# NULL = There is no relationship between the variables.
# Do we reject or fail to reject the null? 


#####################################
# Introduction to OLS Regression

# Another word for the "trend line" from above is the "regression line".
# Look closely at the code for the trend line:
# abline(reg = lm(price~carat,data=diamonds),lty=1,lwd=4, col = "red")
# note: lm(price~carat, data=diamonds)

# This is the code for the linear model, aka regression,
# aka OLS (Ordinary Least Squares).

# The modeling framework in R is straightforward:
# it takes a formula (anything with a tilde, e.g. "y~x")
# and it takes data.

# lm(Dependent Variable ~ Independent Variable, data = DATA SET NAME)
# The data set must be `diamonds`: this script never creates an object called
# `data`, so `data=data` would hand lm() the built-in data() function and fail.
model.1 <- lm(price ~ carat, data=diamonds)


# Call the model. The model output is only so informative - i.e. it doesn't tell us a lot.
model.1

# We can use the "summary" function, which gives us a lot more information
# (coefficients, standard errors, p-values, R-squared, F-statistic).
summary(model.1)

# Review how to read results.

# Model Diagnostics: plot() on a fitted lm object draws a series of
# diagnostic plots, which offer
plot(model.1)
# (1) linearity test,
# (2) normality assumption test,
# (3) equal variance test (homoskedasticity),
# (4) locating outliers (influential cases)

# Residuals: the differences between observed and fitted prices.
# Note: this variable shares its name with the residuals() function; calls to
# residuals(...) still work because R skips non-functions when resolving calls.
residuals <- model.1$residuals
qplot(x=residuals, bins=10)
# How are our residuals/errors distributed?

# Store the summary object so individual statistics can be pulled out of it.
m1.sum <-summary(model.1)
# Stats
m1.sum$fstatistic
m1.sum$r.squared #R2... higher the better

## Apply to another data set

# Before running the next line: open the fake.data Excel workbook (the
# original note spells it "fake.data.xlxs") and save it as a csv file using
# Excel. The file must sit in the working directory.
# Load in the dataset.
fake.data <-read.csv("fake.data.csv")

# Examine the structure of the data frame (column names and types).
str(fake.data)

# View the data in a spreadsheet-style viewer.
View(fake.data)

# Examine the data: summary statistics for every column.
summary(fake.data)

# In-class exercise:
## Calculate the mean of the variable X1.

mean(fake.data$X1)

## Summarize each variable, independently.

summary(fake.data$X1)
summary(fake.data$X2)
summary(fake.data$X3)
summary(fake.data$X4)
summary(fake.data$X5)

## Create separate visualizations of X4.

qplot(x=X4,data=fake.data)
hist(fake.data$X4)

## Create a scatterplot comparing X4 (x-axis) and Y (y-axis).
plot(fake.data$X4, fake.data$Y)

## Is the relationship positive or negative? Add a trend line to find out.
abline(reg = lm(Y~X4,data=fake.data),lty=1,lwd=4, col = "purple")

# Regression: What is the relationship between X4 and Y?

model.2 <- lm(Y ~ X4, data=fake.data)
model.2
summary(model.2)

# In-class question: is the effect of X4 on Y statistically significant?

# Multiple regression: Y regressed on X4 and X1 together.
model.3 <- lm(Y ~ X4 + X1, data=fake.data)
summary(model.3)

# In-class question: how do we interpret these results?
# Redraw the X4-vs-Y scatterplot for reference while discussing.
plot(fake.data$X4, fake.data$Y)




